library(aplore3)
library(ggplot2)
library(plotly)
library(RColorBrewer)
library(pheatmap)
library(cluster)
library(ggcorrplot)
library(dplyr)
library(tidyr)
Data Format: A data.frame with 500 rows and 18 variables such as:
priorfrac - If the patient previously had a fracture
age
weight
height
bmi
premeno
momfrac
armassist
smoke
raterisk
fracscore
fracture
bonemed - Bone
medications at enrollment (1: No, 2: Yes)
bonemed_fu - Bone
medications at follow-up (1: No, 2: Yes)
bonetreat - Bone
medications both at enrollment and follow-up (1: No, 2: Yes)
head(glow_bonemed)
## sub_id site_id phy_id priorfrac age weight height bmi premeno momfrac
## 1 1 1 14 No 62 70.3 158 28.16055 No No
## 2 2 4 284 No 65 87.1 160 34.02344 No No
## 3 3 6 305 Yes 88 50.8 157 20.60936 No Yes
## 4 4 6 309 No 82 62.1 160 24.25781 No No
## 5 5 1 37 No 61 68.0 152 29.43213 No No
## 6 6 5 299 Yes 67 68.0 161 26.23356 No No
## armassist smoke raterisk fracscore fracture bonemed bonemed_fu bonetreat
## 1 No No Same 1 No No No No
## 2 No No Same 2 No No No No
## 3 Yes No Less 11 No No No No
## 4 No No Less 5 No No No No
## 5 No No Same 1 No No No No
## 6 No Yes Same 4 No No No No
mysummary = glow_bonemed %>%
select(age, weight, height, bmi, fracscore) %>% # select variables to summarise
summarise_each(funs(min = min,
q25 = quantile(., 0.25),
median = median,
q75 = quantile(., 0.75),
max = max,
mean = mean,
sd = sd,
variance= var))
# reshape it using tidyr functions
clean.summary = mysummary %>% gather(stat, val) %>%
separate(stat, into = c("var", "stat"), sep = "_") %>%
spread(stat, val) %>%
select(var, min, max, mean, sd,variance) # reorder columns
print(clean.summary)
## var min max mean sd variance
## 1 age 55.00000 90.00000 68.56200 8.989537 80.811780
## 2 bmi 14.87637 49.08241 27.55303 5.973958 35.688178
## 3 fracscore 0.00000 11.00000 3.69800 2.495446 6.227251
## 4 height 134.00000 199.00000 161.36400 6.355493 40.392289
## 5 weight 39.90000 127.00000 71.82320 16.435992 270.141825
summary(glow_bonemed %>% select(priorfrac, premeno, momfrac, armassist, smoke, raterisk, fracture, bonemed, bonemed_fu, bonetreat))
## priorfrac premeno momfrac armassist smoke raterisk fracture
## No :374 No :403 No :435 No :312 No :465 Less :167 No :375
## Yes:126 Yes: 97 Yes: 65 Yes:188 Yes: 35 Same :186 Yes:125
## Greater:147
## bonemed bonemed_fu bonetreat
## No :371 No :361 No :382
## Yes:129 Yes:139 Yes:118
##
Age vs Weight: As weight increases the average age decreases
Age
vs Height: Weak correlation of as height increases age decreases
Age
vs BMI: As bmi increases the average age decreases
Age vs fracscore:
As age increases the average fracscore increases
Weight vs Height: As height increases the average weight
increases
Weight vs BMI: As bmi increases the average weight
increases
Weight vs fracscore: As fracscore increases the average
Weight decreases
Height vs BMI: As bmi increases the average height and variance stay
the same
Height vs fracscore: As fracscore increases the average
height stays the same though variance might decrease
BMI vs fracscore: As fracscore increases the average bmi
decreases
plot(glow_bonemed[, c(5:8, 14)])
Non of the following scatter plots show strong groupings for the
fracture/no fracture categorical variable
ggplot(glow_bonemed, aes(x = age, y = bmi, color = fracture)) +
geom_jitter()+
labs(title = "BMI vs Age")
ggplot(glow_bonemed, aes(x = bmi, y = fracscore, color = fracture)) +
geom_jitter()+
labs(title = "Fracture Score vs BMI")
ggplot(glow_bonemed, aes(x = fracscore, y = age, color = fracture)) +
geom_jitter()+
labs(title = "Age vs Fracture Score")
ggplot(glow_bonemed, aes(x = weight, y = height, color = fracture)) +
geom_jitter()+
labs(title = "Height vs Weight")
Once again there doesn’t seem to be strong groupings of the fracture categorical variable
fracture3dplot = plot_ly(glow_bonemed,
x = ~age,
y = ~height,
z = ~bmi,
color = ~fracture,
colors = c('#0C4B8E', '#BF382A')) %>% add_markers()
fracture3dplot
## `geom_smooth()` using formula = 'y ~ x'
The boxplot shows that the mean fracscore seems to be slightly higher for smokers compared to non smokers
ggplot(glow_bonemed, aes(x = smoke, y = fracscore)) +
geom_boxplot()+
labs(title = "Fracture Score Summary Statistics for Smokers vs Non Smokers")
corr_vars <- c("age", "weight", "height", "bmi", "fracscore")
corr_df <- glow_bonemed[, corr_vars]
corr_df <- cor(corr_df)
ggcorrplot(corr = corr_df, lab = TRUE, lab_size = 2,
colors = c("#6D9EC1", "white", "#E46726")) +
labs(title = "Correlation Between Variables") +
theme(plot.title = element_text(hjust = .5),
plot.subtitle = element_text(hjust = .5))
pheatmap(glow_bonemed[, c(5,8)], scale = "column", fontsize_row = 0.1, cluster_cols = F, legend = T, color = colorRampPalette(c("blue", "white", "red"), space = "rgb")(100))
pheatmap(glow_bonemed[, 5:8], scale = "column", fontsize_row = 0.1, cluster_cols = F, legend = T, color = colorRampPalette(c("blue", "white", "red"), space = "rgb")(100))
zScoreScale = scale(glow_bonemed[, 5:8])
zScoreDistance = dist(zScoreScale)
continuousVariableClustering = hclust(zScoreDistance, method = "complete")
plot(continuousVariableClustering)
corr_vars <- c("age", "weight", "height", "bmi", "fracscore")
pc.result<-prcomp(glow_bonemed[, corr_vars],scale.=TRUE)
#Eigen Vectors
pc.result$rotation
## PC1 PC2 PC3 PC4 PC5
## age 0.4947219 0.46742140 -0.15246583 0.71654567 -0.009160237
## weight -0.5273035 0.46578775 -0.08840991 0.03240244 -0.704362523
## height -0.2345770 -0.08196149 -0.93823245 0.01885633 0.240042129
## bmi -0.4741030 0.51615173 0.24533677 0.05137380 0.667820563
## fracscore 0.4442985 0.53984137 -0.16872342 -0.69463484 0.013601399
#Eigen Values
eigenvals<-pc.result$sdev^2
eigenvals
## [1] 2.374116591 1.523507236 0.975710317 0.123469514 0.003196342
#Scree plot
par(mfrow=c(1,2))
plot(eigenvals,type="l",main="Scree Plot",ylab="Eigen Values",xlab="PC #")
plot(eigenvals/sum(eigenvals),type="l",main="Scree Plot",ylab="Prop. Var. Explained",xlab="PC #",ylim=c(0,1))
cumulative.prop<-cumsum(eigenvals/sum(eigenvals))
lines(cumulative.prop,lty=2)
#Sources:
Hosmer, D.W., Lemeshow, S. and Sturdivant, R.X. (2013)
Applied Logistic Regression, 3rd ed., New York: Wiley
https://cran.r-project.org/web/packages/aplore3/aplore3.pdf#page=11&zoom=100,132,90